11.a A movie should appear in the dataset at least 18 times. Each movie has a record for each day of the weekend (Friday, Saturday, and Sunday) from its opening weekend through at least six weekends later (for the movies that were kept). The movies that were dropped were not in theaters for more than six weekends.
11.b
# Keep only the films that were not flagged as dropped
films_used <- filter(films, dropped != 1)
11.c
# Known calendar release date of "12 Rounds"
round_12_date <- as.Date("2009-03-27")
# Numeric value listed under the "date" column for 12 Rounds
days_before <- 17984
# Back out the dataset's origin date: numeric day 1 maps to this date,
# hence the "+ 1" after subtracting the offset
reference_date <- round_12_date - days_before + 1
# Display the recovered origin date
print(reference_date)
## [1] "1960-01-01"
11.d
# Convert the numeric "date" offset into a calendar date and move the
# date columns up front for readability
films_used_d <- films_used |>
  mutate(movie_date = as.Date(reference_date + date)) |>
  # release_date-style column placed 4th
  select(title, production_budget, release_yr, movie_date, sat_date,
         everything())
# Quick check: each title next to its derived calendar date
films_used_d[, c("title", "movie_date")]
11.e
# Derive each record's Saturday date from sat_date, then flag which day
# of the weekend the observation falls on relative to that Saturday.
films_used_date <- films_used_d |>
  mutate(
    # Calendar date of the record's Saturday
    sat_day = reference_date + sat_date,
    # Sanity-check label for the derived day (should always be Saturday)
    sat_day_of_week = wday(sat_day, label = TRUE),
    # Day-of-weekend indicators; no movie was released on a Sunday, so
    # sun_dummy simply marks the day after the Saturday
    fri_dummy = ifelse(movie_date == sat_day - 1, 1, 0),
    sat_dummy = ifelse(movie_date == sat_day, 1, 0),
    sun_dummy = ifelse(movie_date == sat_day + 1, 1, 0)
  ) |>
  arrange(title)
films_used_date[, c("title", "movie_date", "sat_day",
                    "fri_dummy", "sat_dummy", "sun_dummy")]
11.f
# Number the weekends within each film (week 1 = opening weekend, ...)
# and create one indicator column per week via fastDummies.
films_used_date <- films_used_date |>
  arrange(title, sat_day) |>
  group_by(title) |>
  # factor() assigns consecutive integer codes to the sorted unique
  # sat_date values within each title
  mutate(week = as.integer(factor(sat_date))) |>
  # Drop the title grouping so downstream operations do not silently
  # run per-group (the original left the frame grouped)
  ungroup()
#Now using fast dummies...
films_used_date <- dummy_cols(films_used_date, select_columns = 'week')
films_used_date[, c("title", "movie_date", "week_1", "week_2", "week_3")]
11.g
# Use fastDummies to create one indicator column per release year
# (e.g. release_yr_2009 = 1 when the film was released in 2009)
film <- dummy_cols(films_used_date, select_columns = 'release_yr')
film[, c("title", "release_yr", "release_yr_2009", "release_yr_2010")]
11.h
# Average tickets by week-in-run and day of the weekend, then plot one
# line per weekend day. The three day dummies are mutually exclusive,
# so case_when assigns each row exactly one label.
temp <- film |>
  mutate(weekend = case_when(
    sat_dummy == 1 ~ "Saturday",
    fri_dummy == 1 ~ "Friday",
    sun_dummy == 1 ~ "Sunday"
  )) |>
  group_by(week, weekend) |>
  summarize(mean = mean(tickets, na.rm = TRUE))
temp |>
  ggplot(aes(x = week, y = mean, color = as.factor(weekend))) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = c("Friday"   = "red",
                                "Saturday" = "#4682B4",
                                "Sunday"   = "#8B008B")) +
  labs(x = "Week", y = "Tickets", color = "Weekend") +
  # Readable axis tick placement
  scale_x_continuous(breaks = scales::pretty_breaks(n = 6)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 6)) +
  theme_bw()
NOT NEEDED
# Build each regressor set by pattern-matching column names, joining the
# names with " + " so they can be dropped into a formula string.
# Holiday indicators: columns whose names contain "hh"
holiday <- str_subset(colnames(film), "hh")
holiday_dummy <- str_c(holiday, collapse = " + ")
# Day-of-weekend indicators
weekend_dummy <- str_c(str_subset(colnames(film), "dummy"), collapse = " + ")
# Week-of-run indicators
week_dummy <- str_c(str_subset(colnames(film), "week_"), collapse = " + ")
# Release-year indicators
year_dummy <- str_c(str_subset(colnames(film), "release_yr_"), collapse = " + ")
# Assemble the full specification and fit the baseline model
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
reg_mod1 <- lm(as.formula(mod1), data = film)
# Abnormal viewership = actual tickets minus model prediction
film <- film |>
  mutate(pred_tickets = predict(reg_mod1, film),
         abnormal_viewership = tickets - pred_tickets)
film[, c("tickets", "pred_tickets", "abnormal_viewership", "sat_day")]
# Daily weather data (Stata format)
weather <- read_dta("data/weather_collapsed_day.dta")
# Prefix every weather column with "www_" so the joined weather columns
# are easy to select later by pattern
original_cols <- colnames(weather)
colnames(weather) <- paste0("www_", original_cols)
weather
# Attach weather to each film-day. left_join keeps every film row; any
# movie_date without a matching weather record gets NA weather values.
weather_film <- film |>
  left_join(
    weather |> select(-www_sat_date),  # drop weather's own Saturday key
    by = c("movie_date" = "www_date")
  )
#test <- weather_film |>
# arrange(title) |>
# select(title, movie_date, sat_day, contains("www"))
# Names of the joined weather columns
www_columns <- str_subset(colnames(weather_film), "www_")
# Work on a copy of the joined dataframe
df <- weather_film
# Residualize each weather variable on the same dummy sets used for
# tickets: regress www_X on the dummies, storing the fitted values as
# pred_www_X and the residuals as abnormal_www_X ("abnormal weather").
regressors <- glue("~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
for (col in www_columns) {
  # Formula string: "www_X ~ <dummies>"
  model <- paste(col, regressors)
  fit <- lm(as.formula(model), data = df)
  # Names for the predicted-value and residual columns
  pred_name <- paste("pred", col, sep = "_")
  resid_name <- paste("abnormal", col, sep = "_")
  # Direct [[ assignment by name replaces the fragile
  # eval(parse(text = ...)) construction used previously
  df[[pred_name]] <- predict(fit, df)
  # residual = observed value - predicted value
  df[[resid_name]] <- df[[col]] - df[[pred_name]]
}
# Keep only the abnormal_* residual columns: drop the raw www_ weather
# values and their fitted pred_www_ counterparts
new_weather <- df |>
  select(-contains("pred_www"), -starts_with("www"))
# Abnormal viewership for the second weekend: refit the same ticket
# specification on the week-2 subset only
week_2_data <- new_weather |>
  filter(week_2 == 1)
reg_mod2 <- lm(as.formula(mod1), data = week_2_data)
new_weather_film_wk2 <- week_2_data |>
  mutate(pred_tickets_wk_2 = predict(reg_mod2, week_2_data),
         abnormal_viewership_wk_2 = tickets - pred_tickets_wk_2)
new_weather_film_wk2[, c("tickets", "pred_tickets_wk_2", "week_2",
                         "abnormal_viewership_wk_2")]
# Abnormal viewership in week 1 (the opening weekend).
# NOTE(review): this reuses and overwrites the names mod1/reg_mod1 from
# the full-sample fit above; consider distinct names if those objects
# are needed later.
week_1_data <- new_weather |>
  filter(week_1 == 1)
# Same specification, refit on the week-1 subset
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
reg_mod1 <- lm(as.formula(mod1), data = week_1_data)
new_weather_film_wk1 <- week_1_data |>
  mutate(pred_tickets_wk_1 = predict(reg_mod1, week_1_data),
         abnormal_viewership_wk1 = tickets - pred_tickets_wk_1)
17.a OLS
# OLS of week-1 abnormal viewership on every abnormal-weather regressor
abnormal_weather_wk1_names <-
  str_subset(colnames(new_weather_film_wk1), "abnormal_www")
# Join the regressor names into a formula right-hand side
abnormal_weather_wk1 <- str_c(abnormal_weather_wk1_names, collapse = "+")
ols_glue <- glue("abnormal_viewership_wk1 ~ {abnormal_weather_wk1}")
ols_mod <- lm(as.formula(ols_glue), new_weather_film_wk1)
#modelsummary(list(ols_mod), output = "gt")
17.b
# Forward stepwise selection over the abnormal-weather regressors.
# First restrict the data to the response plus the candidate regressors.
leaps_data <- new_weather_film_wk1 |>
  select(abnormal_viewership_wk1, all_of(abnormal_weather_wk1_names))
forward <- regsubsets(abnormal_viewership_wk1 ~ .,
                      data = leaps_data, method = "forward")
## Reordering variables and trying again:
# Summarize the candidate models at each size
summary_forward <- summary(forward)
# Pick the model size with the largest adjusted R-squared (the 9th here)
best_model_index_fwd <- which.max(summary_forward$adjr2)
# Predictor names in that model, excluding the intercept ([-1])
best_adjr_predictors <- names(coef(forward, id = best_model_index_fwd)[-1])
# Show the selected predictors
best_adjr_predictors
## [1] "abnormal_www_rain" "abnormal_www_mat5_60" "abnormal_www_mat5_85"
## [4] "abnormal_www_mat5_90" "abnormal_www_prec_1" "abnormal_www_cloud_0"
## [7] "abnormal_www_cloud_4" "abnormal_www_cloud_5" "abnormal_www_cloud_8"
# Refit a plain lm on the forward-selected predictors
regs_fwd <- str_c(best_adjr_predictors, collapse = " + ")
fwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_fwd}")
fwd_mod <- lm(as.formula(fwd_glue), data = new_weather_film_wk1)
17.c
# Backward stepwise elimination starting from the full OLS model
# (trace = 0 suppresses the per-step printout)
backward <- step(ols_mod, direction = "backward", trace = 0)
# Surviving predictor names, excluding the intercept
best_bkwd_predictors <- names(coefficients(backward)[-1])
best_bkwd_predictors
## [1] "abnormal_www_rain" "abnormal_www_mat5_45" "abnormal_www_mat5_55"
## [4] "abnormal_www_mat5_70" "abnormal_www_mat5_75" "abnormal_www_prec_0"
## [7] "abnormal_www_cloud_3" "abnormal_www_cloud_4" "abnormal_www_mat_la"
# Refit a plain lm on the backward-selected predictors
regs_bkwd <- str_c(best_bkwd_predictors, collapse = " + ")
bkwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_bkwd}")
bkwd_mod <- lm(as.formula(bkwd_glue), data = new_weather_film_wk1)
17.d
# Ridge regression (alpha = 0) with 5-fold cross-validation.
# NOTE(review): cv.glmnet's fold assignment is random; call set.seed()
# beforehand if reproducible lambda values are needed.
ridge_mod <- cv.glmnet(
  x = as.matrix(new_weather_film_wk1 |>
                  select(all_of(abnormal_weather_wk1_names))),
  y = pull(new_weather_film_wk1, abnormal_viewership_wk1),
  alpha = 0,   # Ridge penalty
  nfolds = 5   # 5-fold cross-validation
)
# Lambda that minimizes the cross-validation error
nfold_ridge <- ridge_mod$lambda.min
coefs_nfold_ridge <- coef(ridge_mod, s = nfold_ridge)
# Largest lambda within one standard error of the minimum (coefficients
# stored for now; predictions come later)
nfold_ridge_1se <- ridge_mod$lambda.1se
coefs_nfold_ridge_1se <- coef(ridge_mod, s = nfold_ridge_1se)
17.e
# Lasso regression (alpha = 1) with 5-fold cross-validation.
# NOTE(review): as with ridge above, set.seed() beforehand if
# reproducible lambda values are needed.
lasso_mod <- cv.glmnet(
  x = as.matrix(new_weather_film_wk1 |>
                  select(all_of(abnormal_weather_wk1_names))),
  y = new_weather_film_wk1 |>
    pull(abnormal_viewership_wk1),  # pull() extracts the numeric vector
  alpha = 1,  # Lasso penalty
  nfolds = 5  # 5-fold cross-validation
)
# Lambda that minimizes the cross-validation error
# (fixed: use <- rather than = for top-level assignment)
nfold_lasso <- lasso_mod$lambda.min
# Largest lambda within one standard error of the minimum
nfold_lasso_1se <- lasso_mod$lambda.1se
#movies <- read_csv("data/movie_lens_20m/movie.csv")
#ratings <- read_csv("data/movie_lens_20m/rating.csv")
Given that \(Y \mid X, \beta \sim \mathcal{N}\left(X \beta, \sigma^{2} I_{n}\right)\), the density of \(Y\) conditional on \(X\) and \(\beta\) is given by the probability density function (pdf) of the multivariate normal distribution. The pdf for a multivariate normal distribution with mean \(\mu\) and covariance matrix \(\Sigma\) is given by:
\[ f(y; \mu, \Sigma) = \frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}} \exp\left(-\frac{1}{2}({y}-\mu)^\top \Sigma^{-1} ({y}-\mu)\right) \]
For our specific case, \(\mu = X\beta\) and \(\Sigma = \sigma^2 I_n\), thus the conditional density of \(Y\) given \(X\) and \(\beta\) is:
\[ f(Y \mid X, \beta) = \frac{1}{(2\pi\sigma^2)^{n/2}} \exp\left(-\frac{1}{2\sigma^2}(Y-X\beta)^\top (Y-X\beta)\right) \]
This expression gives the likelihood of observing \(Y\) given the predictors \(X\) and the regression coefficients \(\beta\), under the assumption of normally distributed errors with variance \(\sigma^2\).
Given that \(\beta \sim \mathcal{N}\left(0, \tau^{2} I_{p}\right)\), the density of \(\beta\) is given by the probability density function (pdf) of the multivariate normal distribution, where \(\mu = 0\) (a \(p\)-dimensional zero vector) and \(\Sigma = \tau^2 I_p\) (a \(p \times p\) covariance matrix where the variance of each component of \(\beta\) is \(\tau^2\) and all covariances are 0). The pdf for a multivariate normal distribution with mean \(\mu\) and covariance matrix \(\Sigma\) is:
\[ f(x; \mu, \Sigma) = \frac{1}{(2\pi)^{k/2}|\Sigma|^{1/2}} \exp\left(-\frac{1}{2}(x-\mu)^\top \Sigma^{-1} (x-\mu)\right) \]
In our case, \(k = p\), \(\mu = 0\), and \(\Sigma = \tau^2 I_p\), thus the density of \(\beta\) is:
\[ f(\beta) = \frac{1}{(2\pi\tau^2)^{p/2}} \exp\left(-\frac{1}{2\tau^2}\beta^\top \beta\right) \]
This expression gives the prior belief about the distribution of the regression coefficients \(\beta\), assuming they are drawn from a multivariate normal distribution centered at zero with covariance \(\tau^2 I_p\), indicating a belief in the shrinkage towards zero of the coefficients (consistent with the principles of Ridge Regression).
To show that the posterior likelihood of \(\beta\) given \(Y\) and \(X\) satisfies the given expression, we’ll use the densities for \(f(Y \mid X, \beta)\) and \(f(\beta)\) that we have previously derived and apply Bayes’ theorem.
Given:
\[ f(Y \mid X, \beta) = \frac{1}{(2\pi\sigma^2)^{n/2}} \exp\left(-\frac{1}{2\sigma^2}(Y-X\beta)^\top (Y-X\beta)\right) \]
\[ f(\beta) = \frac{1}{(2\pi\tau^2)^{p/2}} \exp\left(-\frac{1}{2\tau^2}\beta^\top \beta\right) \]
\[ f(\beta \mid Y, X)=\frac{f(Y \mid X, \beta) f(\beta)}{f(Y)} \]
where \(f(Y)\) does not depend on \(\beta\) and serves as a normalizing constant.
Substituting the expressions for \(f(Y \mid X, \beta)\) and \(f(\beta)\) into Bayes’ theorem:
\[ f(\beta \mid Y, X) = \frac{\frac{1}{(2\pi\sigma^2)^{n/2}} \exp\left(-\frac{1}{2\sigma^2}(Y-X\beta)^\top (Y-X\beta)\right) \times \frac{1}{(2\pi\tau^2)^{p/2}} \exp\left(-\frac{1}{2\tau^2}\beta^\top \beta\right)}{f(Y)} \]
Since \(f(Y)\) does not depend on \(\beta\), and noting that the constants involving \((2\pi\sigma^2)^{n/2}\) and \((2\pi\tau^2)^{p/2}\) do not depend on \(\beta\), we can denote all the terms that do not depend on \(\beta\) by a constant \(C'\). Thus, the expression simplifies to:
\[ f(\beta \mid Y, X) = C' \exp\left(-\frac{1}{2\sigma^2}(Y-X\beta)^\top (Y-X\beta) - \frac{1}{2\tau^2}\beta^\top \beta\right) \]
Here, \(C'\) is a constant that may involve factors from the normalizing constant \(f(Y)\) and the constants pulled out of the exponential terms but crucially does not depend on \(\beta\).
Given that \(C\) is defined as a constant that does not depend on \(\beta\), we can express \(C'\) simply as \(C\) for clarity. Therefore, we have shown that:
\[ f(\beta \mid Y, X) = C \exp\left(-\frac{1}{2\sigma^2}(Y-X\beta)^\top (Y-X\beta) - \frac{1}{2\tau^2}\beta^\top \beta\right) \]
This expression aligns with the given statement, demonstrating that the posterior likelihood of \(\beta\) given \(Y\) and \(X\) indeed satisfies the specified form, with \(C\) being a constant that does not depend on \(\beta\).
To conclude that the \(\beta\) which maximizes the posterior likelihood (the mode of the posterior distribution) is given by the solution to the specified optimization problem, we need to analyze the expression for the posterior likelihood derived in the previous question:
\[ f(\beta \mid Y, X) = C \exp \left(-\frac{1}{2 \sigma^{2}}(Y-X \beta)^{t}(Y-X \beta)-\frac{1}{2 \tau^{2}} \beta^{t} \beta\right) \]
Maximizing the posterior likelihood \(f(\beta \mid Y, X)\) with respect to \(\beta\) is equivalent to maximizing the argument of the exponential, because the exponential function is monotonically increasing. Since that argument is the negative of a sum of two quadratic terms, maximizing it is the same as minimizing that sum, which involves two terms:
The term \(-\frac{1}{2 \sigma^{2}}(Y-X \beta)^{t}(Y-X \beta)\) corresponds to the squared error between the observed and predicted values, scaled by the variance \(\sigma^2\).
The term \(-\frac{1}{2 \tau^{2}} \beta^{t} \beta\) corresponds to the penalty on the size of \(\beta\), scaled by the variance \(\tau^2\).
Dropping the constants and the negative sign, the problem of maximizing the posterior likelihood can be reformulated as minimizing the expression:
\[ \frac{1}{2 \sigma^{2}}(Y-X \beta)^{t}(Y-X \beta) + \frac{1}{2 \tau^{2}} \beta^{t} \beta \]
Multiplying through by \(2\sigma^2\) (which does not change the location of the minimum) to simplify, we obtain:
\[ (Y-X \beta)^{t}(Y-X \beta) + \frac{\sigma^{2}}{\tau^{2}} \beta^{t} \beta \]
Recognizing that \(\beta^{t} \beta = \|\beta\|_{2}^{2}\), the optimization problem becomes:
\[ \min _{\beta}(Y-X \beta)^{t}(Y-X \beta) + \frac{\sigma^{2}}{\tau^{2}}\|\beta\|_{2}^{2} \]
This is precisely the ridge regression optimization problem, where \((Y-X \beta)^{t}(Y-X \beta)\) represents the least squares error term, and \(\frac{\sigma^{2}}{\tau^{2}}\|\beta\|_{2}^{2}\) is the ridge penalty on the size of \(\beta\), with the penalty strength controlled by the ratio \(\frac{\sigma^{2}}{\tau^{2}}\). Thus, the \(\beta\) that maximizes the posterior likelihood under the Bayesian interpretation is indeed given by the solution to this optimization problem, highlighting the connection between Bayesian inference and ridge regression.
In Bayesian inference, we begin with a prior belief about the parameters (in this case, \(\beta\)) of our linear model. This belief is quantified by assuming that \(\beta\) follows a normal distribution centered around zero, with its dispersion controlled by \(\tau^2\). This assumption reflects our expectation that, a priori, the true values of the regression coefficients are likely to be small or close to zero, enforcing a form of regularization even before observing the data.
When we observe the data (represented by \(Y\) and \(X\)), we update our beliefs about \(\beta\) by calculating the posterior distribution, which combines our prior belief and the likelihood of observing the data given \(\beta\). The likelihood of the data given \(\beta\) is also modeled using a normal distribution, where the variance \(\sigma^2\) captures the uncertainty or noise in the observations.
Maximizing the posterior likelihood of \(\beta\), or equivalently, finding the mode of the posterior distribution, involves adjusting our estimate of \(\beta\) to best fit the observed data while also adhering to our initial regularization imposed by the prior. This balance is captured mathematically by an optimization problem that seeks to minimize the sum of two terms: one representing the fit of the model to the data (the least squares error term) and the other representing the size of the coefficients (the ridge penalty term), weighted by the ratio of the variance of the observation noise to the variance of the prior on \(\beta\) (\(\frac{\sigma^2}{\tau^2}\)).
This optimization problem is precisely that of ridge regression, which seeks to find regression coefficients that not only fit the data well but are also small in magnitude, to prevent overfitting and improve the generalizability of the model. The strength of the regularization is controlled by the ratio \(\frac{\sigma^2}{\tau^2}\), linking the Bayesian interpretation directly to the regularization parameter in ridge regression. This parameter balance reflects the trade-off between fitting the data closely (minimizing prediction error) and keeping the model simple and robust to new data (minimizing the size of the coefficients).
In summary, the result elegantly bridges the concepts of Bayesian inference and regularization in regression analysis. It shows that the process of updating our beliefs about the model parameters in light of new data, within a Bayesian framework, naturally leads to a form of regularization that is mathematically equivalent to ridge regression. This connection underscores the interpretability and theoretical underpinning of regularization techniques in machine learning and statistics, providing a principled way to incorporate prior knowledge and manage model complexity.